# necessary imports
import os
import pandas as pd
import numpy as np
import glob
import plotly.express as px
from scipy import signal
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../scripts/')
from kolzur_filter import kz_filter
# Glob pattern of the hashtag count files; keep it at *.csv to process all of them.
path_to_csv = '../../data/BTW17_Twitter/hashtags/*.csv'
file_list = glob.glob(path_to_csv)
if not file_list:
    # Fail with a clear message instead of a KeyError from the groupby below.
    raise FileNotFoundError(f'no csv files match {path_to_csv!r}')

# Read every file (dropping its 'Unnamed: 0' column, presumably a saved index)
# and concatenate once: DataFrame.append copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
frames = [pd.read_csv(file).drop(columns='Unnamed: 0') for file in tqdm(file_list)]
df = pd.concat(frames, ignore_index=True)

# Sum the numeric columns per (date, hashtag) pair so each pair occurs once.
df = df.groupby(['date', 'hashtag'], as_index=False).sum(numeric_only=True)

# Only produces visible output when evaluated as the last expression of a
# notebook cell; it has no effect when the file runs as a plain script.
df.describe(include='all')
# plot top 25 hashtags
# Total count per hashtag, reduced to the 25 largest totals. numeric_only is
# passed by keyword: the first positional argument of GroupBy.sum is
# numeric_only, not a column name.
top25 = (
    df[['hashtag', 'count']]
    .groupby('hashtag', as_index=False)
    .sum(numeric_only=True)
    .nlargest(n=25, columns='count')
)

# Sort in place so the lines are drawn in date order; code further down in this
# file reads the per-hashtag rows of df in this order as well.
df.sort_values(by='date', inplace=True)

fig1 = px.line(
    df[df['hashtag'].isin(top25['hashtag'])],
    x='date',
    y='count',
    color='hashtag',
    title='top25 hashtags',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig1.show()
def plot_peak_detection(hashtag, k):
    """Plot the daily counts of one hashtag and highlight its detected peaks.

    The counts are smoothed with ``kz_filter`` (window ``k``, one iteration).
    Peaks are searched on the smoothed series with
    ``scipy.signal.find_peaks_cwt`` for ten different width ranges, and the
    range whose peaks have the highest mean prominence is kept. Every kept
    peak is marked with a grey band of +/- 3 days.

    Args:
        hashtag: Value of the ``hashtag`` column selecting rows of the
            module-level ``df``.
        k: Window handed to ``kz_filter``. Presumably has to be odd so that
            the filter output fits the zero-padded series -- TODO confirm
            against ``kolzur_filter``.

    Returns:
        None. The figure is shown and the mean prominence of the selected
        peaks is printed (``nan`` when no peak was found).
    """
    # Sort explicitly: peak positions are row offsets into this series, so the
    # result must not depend on df having been sorted by an earlier statement.
    wavelets = df.loc[df['hashtag'] == hashtag, ['date', 'count']].sort_values('date')
    n_days = len(wavelets)
    half_k = int(k / 2)

    # kz_filter is expected to return half_k fewer values on each side, so the
    # borders stay zero. The end index is spelled out because a slice ending
    # at -0 (half_k == 0) would be empty.
    filtered = np.zeros(n_days)
    filtered[half_k:n_days - half_k] = kz_filter(wavelets['count'].to_numpy(), k, 1)
    wavelets['filtered_count'] = filtered

    # Try the width ranges 1..n/1, 1..n/2, ..., 1..n/10 and keep the peaks of
    # the first range reaching the highest mean prominence.
    best_peaks = np.array([], dtype=int)
    best_prominence = np.nan
    for divisor in range(1, 11):
        widths = np.arange(1, n_days / divisor)
        if widths.size == 0:
            # series too short for this divisor
            continue
        candidate = signal.find_peaks_cwt(filtered, widths)
        if len(candidate) == 0:
            # no peaks -> the mean prominence would be nan and spoil the comparison
            continue
        mean_prominence = signal.peak_prominences(filtered, candidate)[0].mean()
        if np.isnan(best_prominence) or mean_prominence > best_prominence:
            best_peaks, best_prominence = candidate, mean_prominence

    fig = px.line(wavelets, x='date', y=['count', 'filtered_count'], title=hashtag,
                  template='simple_white',
                  color_discrete_sequence=px.colors.qualitative.Antique)

    dates = wavelets['date'].tolist()
    band = timedelta(days=3)
    for position in best_peaks:
        peak_day = datetime.strptime(dates[position], '%Y-%m-%d').date()
        fig.add_vrect(x0=str(peak_day - band),
                      x1=str(peak_day + band),
                      line_width=0,
                      fillcolor='grey',
                      opacity=0.2)

    # Report the prominence of the peaks that are actually drawn, not the one
    # of the last width range tried.
    print(f'\nmean peak prom: {best_prominence}')
    fig.show()
# Draw the peak-detection chart (filter window 7) for each hashtag in top25.
for hashtag in top25['hashtag']:
    plot_peak_detection(hashtag, k=7)
def peak_detection(hashtag, k):
    """Return the peak positions in the smoothed daily counts of one hashtag.

    The counts are smoothed with ``kz_filter`` (window ``k``, one iteration).
    Peaks are searched on the smoothed series with
    ``scipy.signal.find_peaks_cwt`` for ten different width ranges, and the
    peaks of the range with the highest mean prominence are returned.

    Args:
        hashtag: Value of the ``hashtag`` column selecting rows of the
            module-level ``df``.
        k: Window handed to ``kz_filter``. Presumably has to be odd so that
            the filter output fits the zero-padded series -- TODO confirm
            against ``kolzur_filter``.

    Returns:
        numpy.ndarray with the row offsets of the peaks within the hashtag's
        date-sorted series; empty when no peak was found.
    """
    # Sort explicitly: the returned positions are row offsets into this
    # series, so they must not depend on df having been sorted elsewhere.
    wavelets = df.loc[df['hashtag'] == hashtag, ['date', 'count']].sort_values('date')
    n_days = len(wavelets)
    half_k = int(k / 2)

    # kz_filter is expected to return half_k fewer values on each side, so the
    # borders stay zero. The end index is spelled out because a slice ending
    # at -0 (half_k == 0) would be empty.
    filtered = np.zeros(n_days)
    filtered[half_k:n_days - half_k] = kz_filter(wavelets['count'].to_numpy(), k, 1)

    # Try the width ranges 1..n/1, 1..n/2, ..., 1..n/10 and keep the peaks of
    # the first range reaching the highest mean prominence. Keeping the peaks
    # avoids running the winning search a second time.
    best_peaks = np.array([], dtype=int)
    best_prominence = np.nan
    for divisor in range(1, 11):
        widths = np.arange(1, n_days / divisor)
        if widths.size == 0:
            # series too short for this divisor
            continue
        candidate = signal.find_peaks_cwt(filtered, widths)
        if len(candidate) == 0:
            # no peaks -> the mean prominence would be nan and spoil the comparison
            continue
        mean_prominence = signal.peak_prominences(filtered, candidate)[0].mean()
        if np.isnan(best_prominence) or mean_prominence > best_prominence:
            best_peaks, best_prominence = candidate, mean_prominence

    return best_peaks
hashtag_list = df['hashtag'].unique().tolist()

# remove incomplete time series
# A hashtag is kept when it has data on at least half of all distinct dates
# (original note: min 60 days).
num_days = df['date'].nunique() / 2
# Count the dates of all hashtags in one pass instead of filtering df once per hashtag.
days_per_hashtag = df.groupby('hashtag')['date'].nunique()
complete_hashtags = [tag for tag in hashtag_list if days_per_hashtag.get(tag, 0) >= num_days]
df_clean = df[df['hashtag'].isin(complete_hashtags)]

# get peak indices
# Gather one record per peak and build the frame once. The previous special
# case for index == 1 overwrote the rows of the first hashtag, and
# DataFrame.append no longer exists in pandas >= 2.0.
peak_records = []
for hashtag in tqdm(complete_hashtags):
    peak_records.extend(
        {'peak': int(position), 'hashtag': hashtag}
        for position in peak_detection(hashtag, 7)
    )
peak_df = pd.DataFrame(peak_records, columns=['peak', 'hashtag'])

# save to csv
path_file = '../../data/BTW17_Twitter/peaks/peaks.csv'
# make sure the target directory exists before writing
os.makedirs(os.path.dirname(path_file), exist_ok=True)
peak_df.to_csv(path_file)